In [ ]:
# imports
import pandas as pd
import plotly.express as px
import plotly.offline as pyo

# Set notebook mode to work in offline
pyo.init_notebook_mode()
In [ ]:
# reading the CSV files
deal_dim = pd.read_csv('../data/deal_dim.csv')
deal_fact = pd.read_csv('../data/deal_fact.csv')
In [ ]:
# checking data types and number of rows
deal_dim.info()
deal_fact.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2994 entries, 0 to 2993
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   DEAL_ID                      2994 non-null   int64  
 1   COMPANY_ID_(DEAL)            2994 non-null   object 
 2   DEAL_PIPELINE                2994 non-null   object 
 3   BUSINESS_TYPE_(DEAL)         2994 non-null   object 
 4   CURRENT_DEAL_STAGE           2994 non-null   object 
 5   CUSTOMER_SEGMENT_(DEAL)      2993 non-null   object 
 6   CURRENT_DEAL_AMOUNT          2903 non-null   float64
 7   OPPORTUNITY_DATE_(DEAL)      2994 non-null   object 
 8   BUSINESS_CASE_PROPOSAL_DATE  1752 non-null   object 
 9   NEGOTIATION_DATE             1122 non-null   object 
 10  VERBAL_AGREEMENT_DATE        522 non-null    object 
 11  BOOKING_DATE                 647 non-null    object 
 12  CLOSE_DATE                   2900 non-null   object 
 13  CLOSED_LOST_DATE             1805 non-null   object 
 14  SALES_CYCLE                  647 non-null    float64
 15  LEGAL_CYCLE                  454 non-null    float64
 16  MARKET_(DEAL)                2994 non-null   object 
 17  COUNTRY_(DEAL)               2972 non-null   object 
 18  DEAL_TYPE                    2994 non-null   object 
 19  SOURCED_BY_(DEAL)            2987 non-null   object 
 20  LOST_REASON                  1788 non-null   object 
 21  LOST_TO_COMPETITOR           1133 non-null   object 
 22  FORECAST_PROBABILITY         2850 non-null   float64
 23  BOOKINGS_FORECAST            2903 non-null   float64
dtypes: float64(5), int64(1), object(18)
memory usage: 561.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7781 entries, 0 to 7780
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   DEAL_ID                            7781 non-null   int64  
 1   COMPANY_ID_NUMERIC_DEAL_DATE_FACT  7781 non-null   object 
 2   DEAL_STAGE_DATE                    7772 non-null   object 
 3   OPPORTUNITY_(DEAL)                 2974 non-null   float64
 4   OPPORTUNITY_AMOUNT                 2883 non-null   float64
 5   BUSINESS_CASE_PROPOSAL             1747 non-null   float64
 6   BUSINESS_CASE_PROPOSAL_AMOUNT      1747 non-null   float64
 7   DEAL_STAGE_NEGOTIATION_DATE        1121 non-null   object 
 8   NEGOTIATION                        1121 non-null   float64
 9   NEGOTIATION_AMOUNT                 1121 non-null   float64
 10  DEAL_STAGE_VERBAL_AGREEMENT_DATE   522 non-null    object 
 11  VERBAL_AGREEMENT                   522 non-null    float64
 12  VERBAL_AGREEMENT_AMOUNT            522 non-null    float64
 13  DEAL_STAGE_CLOSED_LOST_DATE        1785 non-null   object 
 14  CLOSED_LOST                        1785 non-null   float64
 15  CLOSED_LOST_AMOUNT                 1785 non-null   float64
 16  DEAL_ID_BOOKING_FACT               647 non-null    float64
 17  BOOKING                            647 non-null    float64
 18  BOOKING_AMOUNT                     647 non-null    float64
dtypes: float64(13), int64(1), object(5)
memory usage: 1.1+ MB
In [ ]:
# i) Number of opportunities over time (monthly)
deal_dim['OPPORTUNITY_MONTH'] = pd.to_datetime(deal_dim['OPPORTUNITY_DATE_(DEAL)']).dt.strftime("%Y-%m")
number_of_opportunities_per_month = deal_dim.groupby('OPPORTUNITY_MONTH')['DEAL_ID'].count()

# plotting the graph
fig = px.line(number_of_opportunities_per_month, title='Number of opportunities per month',
              labels={
                  'value': 'Number of opportunities',
                  'OPPORTUNITY_MONTH': 'Month'
              }
             )
fig.update_layout(showlegend=False)
pyo.iplot(fig, filename = 'number_of_opportunities_per_month')
In [ ]:
# ii) Total opportunity amount and booking amount over time on a monthly basis in one graph.
# I used BUSINESS_CASE_PROPOSAL_AMOUNT instead of OPPORTUNITY_AMOUNT because data was missing
deal_fact['DEAL_STAGE_MONTH'] = pd.to_datetime(deal_fact['DEAL_STAGE_DATE']).dt.strftime("%Y-%m")
total_amount_per_month = deal_fact.groupby('DEAL_STAGE_MONTH')['BUSINESS_CASE_PROPOSAL_AMOUNT','BOOKING_AMOUNT'].sum()

# plotting the graph
fig = px.line(total_amount_per_month, title='Total amount per month',
              labels={
                  'value': 'Total Amount $',
                  'DEAL_STAGE_MONTH': 'Month'
              }
             )
pyo.iplot(fig, filename = 'total_amount_per_month')
/var/folders/wf/z578m42550bgsqj4zdbtlxjw0000gn/T/ipykernel_37836/2921940191.py:4: FutureWarning:

Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.

In [ ]:
# iii) Average Sales cycle time over time
avg_sales_cycle_per_month = deal_dim.groupby('OPPORTUNITY_MONTH')['SALES_CYCLE'].mean()

# plotting the graph
fig = px.line(avg_sales_cycle_per_month, title='Average sales cycle per month',
              labels={
                  'value': 'Avg. Sales cycle',
                  'OPPORTUNITY_MONTH': 'Month'
              }
             )
fig.update_layout(showlegend=False)
pyo.iplot(fig, filename = 'avg_sales_cycle_per_month')